#-------------------------------------------------------#
#   Quality Assessment of the Academic Freedom Index    #
#-------------------------------------------------------#

# Title: "Quality Assessment of the Academic Freedom Index: Strengths, Weaknesses, and How Best to Use It" 
# Authors: "Lott, Lars", "Spannagel, Janika"
# date: 2024-09-19
# journal: Perspectives on Politics
# DOI: TBA
# written under "R and RStudio (4.4.1)"

#### Preliminaries ####

R.version$version.string

# clear workspace
rm(list=ls())

# load packages
library(here)
library(tidyverse)
library(magrittr)
library(reshape2)
library(ggpubr)
library(countrycode)
library(ggrepel)
library(ggthemes)
library(lubridate)
library(estimatr)
library(texreg)
library(readxl)
library(haven)
library(scales)
library(dotwhisker)
library(ggeffects)
library(viridis)
library("Hmisc")
library(marginaleffects)

set.seed(1234)

#---------------------------------------------------#
#--------------------load data ---------------------#
#---------------------------------------------------#

#load coder-level data 
coder_level_ds <- read.csv(file.path("data/vdem_v13",  "vdem_13_coder_level", "coder_level_ds_v13.csv"))
gc()

#load coder characteristics

coder_characteristics <- readRDS(file.path("data\vdem_v13\coder_characteristics", "coder_characteristics/academzz_all.rds"))
# wrong main_country coder_ids in the coder_characteristics dataframe, correct them with the updated_main_country dataframe

updated_main_country <- readRDS(file.path("data\vdem_v13\coder_characteristics", "coder_characteristics/updated_main_country.rds"))

updated_main_country <- updated_main_country %>% dplyr::select(-code)

summary(updated_main_country)

attributes(coder_characteristics) 
coder_characteristics_wide <- coder_characteristics %>%
  pivot_wider(id_cols = coder_id, names_from = question, values_from = code)

summary(coder_characteristics_wide)

coder_characteristics_wide <- coder_characteristics_wide %>%
  dplyr::select(-c(main_country, reside)) %>%
  left_join(updated_main_country, by = "coder_id") %>%
  mutate(reside = ifelse(bin ==1, main_country_id, 0)) %>%
  rename(bin_reside = bin)

saveRDS(coder_characteristics_wide, file.path("data/coder_characteristics_wide.rds"))

summary(coder_characteristics_wide)
coder_ids <- unique(coder_characteristics_wide$coder_id)
saveRDS(coder_ids, "data/coder_ids.rds")

# load V-Dem dataset 
vdem_full_v13 <- readRDS(file.path("data/vdem_v13", "vdem_13_cy", "V-Dem-CY-Full+Others-v13.rds"))

#### prepare country_unit.rds for BFA and Merging with Posteriors ####
country_unit <- vdem_full_v13 %>%
  dplyr::select(country_text_id, country_id, year, project, historical_date) %>%
  mutate(project = ifelse(project == 0, "contemporary-only", 
                          ifelse(project == 1, "historical-only", 
                                 "overlap")))
country_unit$year <- as.integer(country_unit$year)
saveRDS(country_unit, "data/bfa/country_unit.rds")

gc()

#-------------------------------------------------------------------#
#-------Section 3.2 Expert Qualifications and Potential Biases -----#
#-------------------------------------------------------------------#

vdem_subset_v13 <- vdem_full_v13 %>%
  dplyr::select(country_id, country_text_id, year, historical_date, starts_with("v2xca_academ"))

list_var <- c("v2cafres", "v2cafexch", "v2cainsaut", "v2casurv", "v2clacfree")

number_of_coders <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  left_join(vdem_subset_v13, by = c("country_id", "historical_date")) %>%
  drop_na(v2xca_academ) %>% # drop observations, where no AFI was aggregated 
  filter(!is.na(v2cafres) | !is.na(v2cafexch) | !is.na(v2cainsaut) | !is.na(v2casurv) | !is.na(v2clacfree)) %>%
  distinct(coder_id) %>%
  count()

print(number_of_coders)

coder_level_ds$year <- year(coder_level_ds$historical_date)

mean_number_of_coders <- coder_level_ds |>
  mutate(historical_date = ymd(historical_date)) %>%
  filter(!is.na(v2cafres) | !is.na(v2cafexch) | !is.na(v2cainsaut) | !is.na(v2casurv) | !is.na(v2clacfree)) |>
  filter(year >=1900) |>
  group_by(country_id, historical_date) |>
  distinct(coder_id)|>
  count()

summary(mean_number_of_coders$n)

mean_number_of_coders_afi <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  left_join(vdem_full_v13 %>% dplyr::select(v2xca_academ, country_text_id, historical_date), 
            by=c("country_text_id", "historical_date")) %>%
  filter(!is.na(v2xca_academ)) %>%
  filter(!is.na(v2cafres) | !is.na(v2cafexch) | !is.na(v2cainsaut) | !is.na(v2casurv) | !is.na(v2clacfree)) %>%
  dplyr::select(coder_id, country_id, country_text_id, historical_date, v2xca_academ, v2cafres, v2cafexch, v2cainsaut,v2casurv, v2clacfree) %>%
  group_by(country_text_id, historical_date) %>%
  distinct(coder_id)%>%
  count()

mean_number_of_coders_afi$year <- year(mean_number_of_coders_afi$historical_date)
mean_number_of_coders_afi$ctryyear <- paste(mean_number_of_coders_afi$country_text_id, mean_number_of_coders_afi$year, sep = "-")

summary(mean_number_of_coders_afi$n)

# Qualify the notion that "typically more than five expert coders per indicator per country-year,"

sum(mean_number_of_coders_afi$n>=5) / nrow(mean_number_of_coders_afi)

#### Distinct Coders per indicator ####

mean_number_of_coders_v2cafres <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  left_join(vdem_full_v13 %>% dplyr::select(v2xca_academ, country_text_id, historical_date), 
            by=c("country_text_id", "historical_date")) %>%
  filter(!is.na(v2xca_academ)) %>%
  filter(!is.na(v2cafres)) %>%
  dplyr::select(coder_id, country_id, country_text_id, historical_date, v2xca_academ, v2cafres, v2cafres, v2cainsaut,v2casurv, v2clacfree) %>%
  group_by(country_text_id, historical_date) %>%
  distinct(coder_id) %>%
  count()

summary(mean_number_of_coders_v2cafres$n)

mean_number_of_coders_v2cafexch <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  left_join(vdem_full_v13 %>% dplyr::select(v2xca_academ, country_text_id, historical_date), 
            by=c("country_text_id", "historical_date")) %>%
  filter(!is.na(v2xca_academ)) %>%
  filter(!is.na(v2cafexch)) %>%
  dplyr::select(coder_id, country_id, country_text_id, historical_date, v2xca_academ, v2cafres, v2cafexch, v2cainsaut,v2casurv, v2clacfree) %>%
  group_by(country_text_id, historical_date) %>%
  distinct(coder_id) %>%
  count()

summary(mean_number_of_coders_v2cafexch$n)


mean_number_of_coders_v2cainsaut <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  left_join(vdem_full_v13 %>% dplyr::select(v2xca_academ, country_text_id, historical_date), 
            by=c("country_text_id", "historical_date")) %>%
  filter(!is.na(v2xca_academ)) %>%
  filter(!is.na(v2cainsaut)) %>%
  dplyr::select(coder_id, country_id, country_text_id, historical_date, v2xca_academ, v2cafres, v2cafexch, v2cainsaut,v2casurv, v2clacfree) %>%
  group_by(country_text_id, historical_date) %>%
  distinct(coder_id) %>%
  count()

summary(mean_number_of_coders_v2cainsaut$n)


mean_number_of_coders_v2casurv <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  left_join(vdem_full_v13 %>% dplyr::select(v2xca_academ, country_text_id, historical_date), 
            by=c("country_text_id", "historical_date")) %>%
  filter(!is.na(v2xca_academ)) %>%
  filter(!is.na(v2casurv)) %>%
  dplyr::select(coder_id, country_id, country_text_id, historical_date, v2xca_academ, v2cafres, v2cafexch, v2cainsaut,v2casurv, v2clacfree) %>%
  group_by(country_text_id, historical_date) %>%
  distinct(coder_id) %>%
  count()

summary(mean_number_of_coders_v2casurv$n)


mean_number_of_coders_v2clacfree <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  left_join(vdem_full_v13 %>% dplyr::select(v2xca_academ, country_text_id, historical_date), 
            by=c("country_text_id", "historical_date")) %>%
  filter(!is.na(v2xca_academ)) %>%
  filter(!is.na(v2clacfree)) %>%
  dplyr::select(coder_id, country_id, country_text_id, historical_date, v2xca_academ, v2cafres, v2cafexch, v2cainsaut,v2casurv, v2clacfree) %>%
  group_by(country_text_id, historical_date) %>%
  distinct(coder_id) %>%
  count()

summary(mean_number_of_coders_v2clacfree$n)


#### Descriptive Table Coder Characteristics ####
#### Table 2 ####

# building data.frame with information from V-dem headquarter we received from them by mail (see paper for detailed description
df_coder_descriptive_statistics <- data.frame(
  Variable = c("Gender", "Gender", "Gender", "Age", "Age","Age","Age", "Reside in Main Country Coded", "Reside in Main Country Coded", "Reside in Main Country Coded",
               "Born in Main Country Coded", "Born in Main Country Coded", "Born in Main Country Coded", "Born and/or Reside in Main Country Coded",  "Born and/or Reside in Main Country Coded",  "Born and/or Reside in Main Country Coded", 
               "Education Level", "Education Level", "Education Level", "Education Level",
               "Government Employee", "Government Employee", "Government Employee"), 
  item  = c("Men", "Women", "unknown", "<= 34 years", ">34 years and <50 years", ">=50 years", "unknown", "Yes", "No", "unknown",  "Yes", "No", "unknown", "Yes", "No", "unknown",
            "PhD", "Master's degree", "No PhD or master's degree", "unknown", "Yes", "No", "unknown"), 
  n = c(1165, 465, 567, 146, 795, 680, 567, 944, 716, 537, 1134, 530, 533, 1247, 419, 531, 
        1221, 336, 107, 533, 1936, 67, 194)
)

df_coder_descriptive_statistics <- df_coder_descriptive_statistics %>%
  mutate( 
          percentage = n/2197, 
          percentage = scales::percent(percentage, accuracy = 0.1), 
          n = as.character(n),)

modelsummary::datasummary_df(df_coder_descriptive_statistics)

modelsummary::datasummary_df(df_coder_descriptive_statistics, output = "outputs_original_data/Table_2.tex")

## Government Employees ##

reg_type_info <- vdem_full_v13 %>% 
  dplyr::select(country_id, v2x_regime, year) %>% 
  filter(year == 2022)
          
coder_characteristics_wide_gov_employees <- coder_characteristics_wide %>%
  mutate(country_id = main_country_id) %>%
  left_join(reg_type_info, by=c("country_id")) %>%
  filter(government_employment == 1)

table(coder_characteristics_wide_gov_employees$bin_reside)

coder_characteristics_wide_gov_employees %>%
  filter(bin_reside == 1) %>%
  group_by(v2x_regime) %>%
  count()

rm(coder_characteristics_wide_gov_employees, reg_type_info )


#---------------------------------------------------#
#------------- Section 3.4 Index Aggregation -------#
#---------------------------------------------------#

vdem_subset_index <- vdem_full_v13 %>%
  mutate(afi_additive = (v2cafres + v2cafexch + v2clacfree + v2cainsaut +  v2casurv)/5, 
         afi_multiplicative_inst = v2cainsaut_osp * ((v2cafres_osp + v2cafexch_osp + v2clacfree_osp + v2casurv_osp)/4), 
         afi_multiplicative = v2cafres_osp*v2cafexch_osp*v2clacfree_osp*v2cainsaut_osp*v2casurv_osp)

summary(vdem_subset_index$afi_additive)
summary(vdem_subset_index$afi_multiplicative_inst)
summary(vdem_subset_index$afi_multiplicative)

vdem_subset_index %<>% 
  mutate(afi_additive = scales::rescale(afi_additive, to = c(0,1)), 
         afi_multiplicative = scales::rescale(afi_multiplicative, to = c(0,1)), 
         afi_multiplicative_inst = scales::rescale(afi_multiplicative_inst, to = c(0,1)))

summary(vdem_subset_index$afi_additive)
summary(vdem_subset_index$afi_multiplicative_inst)
summary(vdem_subset_index$afi_multiplicative)

#### Figure C1 Appendix ####

ggplot(vdem_subset_index, aes(x = afi_multiplicative)) +
  geom_density(fill="gray")+
  geom_vline(aes(xintercept=mean(afi_multiplicative, na.rm = TRUE)), color="blue",
             linetype="dashed")+
  labs(title="AFI",x="AFI", y = "Density")+
  theme_classic()

f1 <- ggplot(vdem_subset_index, aes(x = v2xca_academ)) +
  geom_density(fill="gray")+
  geom_vline(aes(xintercept=mean(v2xca_academ, na.rm = TRUE)), color="blue",
             linetype="dashed")+
  labs(title="AFI",x="AFI", y = "Density")+
  theme_classic()

f2 <- ggplot(vdem_subset_index, aes(x = afi_additive)) +
  geom_density(fill="gray")+
  geom_vline(aes(xintercept=mean(afi_additive, na.rm = TRUE)), color="blue",
             linetype="dashed")+
  labs(title="additive AFI",x="additive AFI", y = "Density")+
  theme_classic()

f3 <- ggplot(vdem_subset_index, aes(x = afi_multiplicative_inst)) +
  geom_density(fill="gray")+
  geom_vline(aes(xintercept=mean(afi_multiplicative_inst, na.rm = TRUE)), color="blue",
             linetype="dashed")+
  labs(title="multiplicative AFI",x="multiplicative AFI", y = "Density")+
  theme_classic()

f4 <-  ggplot(vdem_subset_index, aes(x = v2xca_academ, y = afi_additive)) +
  geom_point(aes(alpha = 0.05)) +
  labs(title="",x="Acacdemic Freedom Index", y = "additive AFI")+
  theme_classic() +
  guides(alpha = "none")

f5 <-  ggplot(vdem_subset_index, aes(x = v2xca_academ, y = afi_multiplicative_inst)) +
  geom_point(aes(alpha = 0.05)) +
  labs(title="",x="Acacdemic Freedom Index", y = "multiplicative AFI")+
  theme_classic() +
  guides(alpha = "none")

f6 <-  ggplot(vdem_subset_index, aes(x = afi_additive, y = afi_multiplicative_inst)) +
  geom_point(aes(alpha = 0.05)) +
  labs(title="",x="additive AFI", y = "multiplicative AFI")+
  theme_classic() +
  guides(alpha = "none")


ggarrange(f1, f2, f3, f4, f5, f6,
          labels = c("A", "B", "C", "D", "E", "F"),
          ncol = 2, nrow = 3)

ggsave("outputs_original_data/Figure_C1.pdf", width = 30,
       height = 30,
       units = c("cm"), dpi = 1000)

face_validity_exmaples <- vdem_subset_index %>%
  mutate(difference_MAFI_AFI =  v2xca_academ -afi_multiplicative_inst) %>%
  dplyr::select(country_name, year,v2xca_academ, afi_multiplicative_inst, difference_MAFI_AFI, v2cafres, v2cafexch,
                v2clacfree, v2cainsaut,v2casurv) %>%
  drop_na(v2xca_academ)

vdem_subset_index <- vdem_subset_index %>%
  dplyr::select(v2xca_academ, afi_additive, afi_multiplicative_inst)


res2 <- rcorr(as.matrix(vdem_subset_index))
res2

#### Table 3 ####

# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
  )
}

flattenCorrMatrix(res2$r, res2$P)
res2$P


#-------------------------------------------------------------#
#-------------Section 3.5: Number of Observations  -----------#
#-------------------------------------------------------------#

vdem_full_v13 %>%
  drop_na(v2xca_academ) %>%
  count()

vdem_full_v13 %>%
  drop_na(v2xca_academ) %>%
  distinct(country_name)

vdem_full_v13 %>%
  filter(year == 2022) %>%
  distinct(country_name)

vdem_full_v13 %>%
  filter(year == 2022) %>%
  drop_na(v2xca_academ) %>%
  distinct(country_name)


#--------------------------------------------------------------#
#----------- Section 3.6 Predicting Respondent Disagreement----#
#--------------------------------------------------------------#

list_var <- c("v2cafres", "v2cafexch", "v2cainsaut", "v2casurv", "v2clacfree")

## prepare coder level data ##

coder_characteristics_wide <- coder_characteristics_wide %>%
  mutate(reside_in_country = ifelse(reside == main_country_id, 1, 0)) # build reside_in_country variable 
# by using main country of coding and v2zzreside of a coder.

summary(coder_characteristics_wide)

#### Statistical Analysis ####

coder_level_ds <- coder_level_ds %>%
  left_join(coder_characteristics_wide, by = ("coder_id"))

temp_i <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  group_by(country_id, historical_date) %>%
  dplyr::summarize(sd_v2cafres = sd(v2cafres, na.rm = TRUE), 
            n_v2cafres = sum(!is.na(v2cafres)), 
            sd_v2cafexch = sd(v2cafexch, na.rm = TRUE), 
            n_v2cafexch =  sum(!is.na(v2cafexch)),
            sd_v2cainsaut = sd(v2cainsaut, na.rm = TRUE), 
            n_v2cainsaut =  sum(!is.na(v2cainsaut)),
            sd_v2casurv = sd(v2casurv, na.rm = TRUE), 
            n_v2casurv =  sum(!is.na(v2casurv)),
            sd_v2clacfree = sd(v2clacfree, na.rm = TRUE), 
            n_v2clacfree =  sum(!is.na(v2clacfree)), 
            v2cafres_conf = mean(v2cafres_conf, na.rm = TRUE), 
            v2cafexch_conf = mean(v2cafexch_conf, na.rm = TRUE), 
            v2cainsaut_conf = mean(v2cainsaut_conf, na.rm = TRUE), 
            v2casurv_conf = mean(v2casurv_conf, na.rm = TRUE), 
            v2clacfree_conf = mean(v2clacfree_conf, na.rm = TRUE))
            
number_of_coders <- coder_level_ds %>%
   mutate(historical_date = ymd(historical_date)) %>%
   group_by(country_id, historical_date) %>%
   dplyr::summarize(number_coders = n_distinct(coder_id), 
             number_of_extern_coders = sum(reside_in_country, na.rm = T)/number_coders)

temp_i <- temp_i %>%
  left_join(number_of_coders, by = c("country_id", "historical_date"))
 

vdem_subset_v13 <- vdem_full_v13 %>%
  mutate(historical_date = ymd(historical_date)) %>%
  dplyr::select(country_id, country_name, year, historical_date, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree, v2xca_academ,
                v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,e_gdppc, e_pop, e_cow_exports,e_cow_imports, v2x_freexp, e_regionpol) %>%
  left_join(temp_i, by = c("country_id", "historical_date")) %>%
  filter(year >=1900)

vdem_subset_v13$e_regionpol_6C <- as.factor(vdem_subset_v13$e_regionpol_6C)

summary(vdem_subset_v13$number_of_extern_coders)

summary(vdem_subset_v13)

saveRDS(vdem_subset_v13, "data/vdem_subset_v13.rds")

## Freedom to research and teach ##
m1 <- lm_robust(sd_v2cafres ~  year + v2x_freexp + v2cafres + v2cafres:v2cafres + n_v2cafres + number_of_extern_coders + v2cafres_conf + e_regionpol_6C,
                clusters = country_id, 
                data = vdem_subset_v13)
summary(m1)

## Freedom of academic exchange and dissemination ##
m2 <-  lm_robust(sd_v2cafexch ~  year + v2x_freexp + v2cafexch + v2cafexch:v2cafexch + n_v2cafexch + number_of_extern_coders + v2cafexch_conf + e_regionpol_6C,
                 clusters = country_id, 
                 data = vdem_subset_v13)
summary(m2)

## Institutional autonomy ##
m3 <-  lm_robust(sd_v2cainsaut ~  year + v2x_freexp + v2cainsaut + v2cainsaut:v2cainsaut + n_v2cainsaut + number_of_extern_coders + v2cainsaut_conf + e_regionpol_6C,
                 clusters = country_id, 
                 data = vdem_subset_v13)
summary(m3)

## Campus integrity ##
m4 <-  lm_robust(sd_v2casurv ~  year + v2x_freexp + v2casurv + v2casurv:v2casurv + n_v2casurv + number_of_extern_coders + v2casurv_conf + e_regionpol_6C,
                 clusters = country_id, 
                 data = vdem_subset_v13)
summary(m4)

## Freedom of academic and cultural expression ##
m5 <-  lm_robust(sd_v2clacfree ~  year + v2x_freexp + v2clacfree + v2clacfree:v2clacfree + n_v2clacfree + number_of_extern_coders + v2clacfree_conf + e_regionpol_6C,
                 clusters = country_id, 
                 data = vdem_subset_v13)
summary(m5)

## Pooled Analysis with all indicators ##

vdem_subset_v13_pooled <- vdem_subset_v13 %>%
  dplyr::select(country_id, historical_date, year, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree, sd_v2cafres, sd_v2cafexch, 
                sd_v2cainsaut, sd_v2casurv, sd_v2clacfree, v2cafres_conf, v2cafexch_conf, v2cainsaut_conf, v2casurv_conf, v2clacfree_conf,
                v2xca_academ, 
                v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,e_gdppc, e_pop, e_cow_exports,e_cow_imports, v2x_freexp, number_coders, number_of_extern_coders) %>%
  pivot_longer(!c(country_id, historical_date, year,  v2x_polyarchy, v2x_liberal, v2x_libdem, v2xca_academ, number_coders,
                  e_regionpol_6C,e_gdppc, e_pop, e_cow_exports,e_cow_imports, v2x_freexp,  v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree, number_of_extern_coders, 
                  v2cafres_conf, v2cafexch_conf, v2cainsaut_conf, v2casurv_conf, v2clacfree_conf), names_to = "indicator", values_to = "value_indicator") %>%
  drop_na(value_indicator) %>%
  mutate(v2xca_academ_sqr  = v2xca_academ^2) %>%
  filter(year >=1900) %>%
  mutate(confidence_coders_indidator = case_when(indicator == "sd_v2cafres" ~v2cafres_conf,
                                                 indicator == "sd_v2cafexch" ~v2cafexch_conf,
                                                 indicator == "sd_v2cainsaut" ~v2cainsaut_conf,
                                                 indicator == "sd_v2casurv" ~v2casurv_conf,
                                                 indicator == "sd_v2clacfree" ~v2clacfree_conf))

## pooled regression analysis ##
m6 <-  lm_robust(value_indicator ~ year + v2x_freexp +  v2xca_academ + I(v2xca_academ^2) + number_coders +  
                   number_of_extern_coders + confidence_coders_indidator + e_regionpol_6C + indicator,
                 clusters = country_id, 
                 data = vdem_subset_v13_pooled)
summary(m6)


## Table E1 Supplementary Appendix ##
texreg(list(m1, m2, m3, m4, m5, m6), 
       head.tag = TRUE, body.tag = TRUE,
       file = "outputs_original_data/Table_E1.tex",
       digits = 3,
       custom.coef.names = c("Intercept",
                             "Year", 
                             "Freedom of Expression", 
                             "Freedom to research and teach", 
                             "Number of Coders Freedom to research and teach", 
                             "Percentage of Extern Coders", 
                             "Rater's Mean Confidence", 
                             "Latin America and the Caribbean", 
                             "MENA", 
                             "SSA", 
                             "Western Europe and North America", 
                             "Asia and Pacific", 
                             "Freedom of academic exchange and dissemination",
                             "Number of Coders  Freedom of academic exchange and dissemination",
                             "Rater's Mean Confidence", 
                             "Institutional autonomy", 
                             "Number of Coders Institutional autonomy", 
                             "Rater's Mean Confidence", 
                             "Campus integrity", 
                             "Number of Coders Campus integrity", 
                             "Rater's Mean Confidence", 
                             "Freedom of academic and cultural expression", 
                             "Number of Coders",
                             "Rater's Mean Confidence", 
                             "Academic Freedom Index", 
                             "Academic Freedom Index sqr.",
                             "Number of Coders Acadmeic Freedom Index", 
                             "Rater's Mean Confidence",
                             "Freedom to research and teach",
                             "Institutional autonomy", 
                             "Campus integrity", 
                             "Freedom of academic and cultural expression"),
       booktabs = TRUE,
       use.packages = FALSE,
       caption = "Linear Models predicting respondents disagreement",
       fontsize = "scriptsize",
       stars = c(0.001, 0.01, 0.05), 
       label = "Table:E1", 
       longtable = TRUE)


#### Figure 1: Coefficient Plot ####

# Put model estimates into temporary data.frames:
model6Frame <- data.frame(term = m6$term,
                          estimate = m6$coefficients,
                          std.error = m6$std.error,
                          conf.low = m6$conf.low,
                          conf.high = m6$conf.high,
                          modelName = "Pooled Model") %>%
  by_2sd(vdem_subset_v13_pooled)

# Specify the width of your confidence intervals
interval1 <- -qnorm((1-0.9)/2)  # 90% multiplier
interval2 <- -qnorm((1-0.95)/2)  # 95% multiplier

# Combine these data.frames
model6Frame <- model6Frame %>%
  filter(!str_detect(term, "(Intercept)")) %>%
  mutate(term = case_when(term == "indicatorsd_v2cafres" ~ "Dummy Freedom to research and teach", 
                          term == "indicatorsd_v2cainsaut" ~ "Dummy Institutional autonomy", 
                          term == "indicatorsd_v2casurv" ~ "Dummy Campus integrity", 
                          term == "indicatorsd_v2clacfree" ~ "Dummy Freedom of academic and cultural expression", 
                          term == "I(v2xca_academ^2)" ~ "Level of Academic Freedom squared", 
                          term == "v2xca_academ" ~ "Level of Academic Freedom", 
                          term == "year" ~ "Year", 
                          term == "v2x_freexp" ~ "Freedom of Expression",
                          term == "number_coders" ~ "Number of respondents", 
                          term == "number_of_extern_coders" ~ "Percentage External Coders", 
                          term == "confidence_coders_indidator" ~ "Rater's Mean Confidence", 
                          term == "e_regionpol_6C2" ~ "Latin America and the Caribbean", 
                          term == "e_regionpol_6C3" ~ "MENA", 
                          term == "e_regionpol_6C4" ~ "SSA", 
                          term == "e_regionpol_6C5" ~ "Western Europe and North America", 
                          term == "e_regionpol_6C6" ~ "Asia and Pacific")) %>%
  mutate(conf.low = estimate - std.error*interval2, 
         conf.high = estimate + std.error*interval2)



# Plot
ggplot(model6Frame, aes( x = factor(term, levels=c("Latin America and the Caribbean", 
                                                   "MENA",
                                                   "SSA", 
                                                   "Western Europe and North America", 
                                                   "Asia and Pacific",
                                                   "Dummy Freedom to research and teach", 
                                                   "Dummy Institutional autonomy", 
                                                   "Dummy Campus integrity", 
                                                   "Dummy Freedom of academic and cultural expression", 
                                                   "Rater's Mean Confidence", 
                                                   "Percentage External Coders",
                                                   "Number of respondents", 
                                                   "Level of Academic Freedom squared",
                                                   "Level of Academic Freedom", 
                                                   "Freedom of Expression", 
                                                   "Year")))) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_linerange(aes(ymin = estimate - std.error*interval1,
                     ymax = estimate + std.error*interval1),
                 lwd = 1, position = position_dodge(width = 2/3)) +
  geom_pointrange(aes(y = estimate, ymin = estimate - std.error*interval2,
                      ymax = estimate + std.error*interval2,),
                  lwd = 1/2, position = position_dodge(width = 2/3),
                  fill = "WHITE") +
  coord_flip() + theme_bw() +
  ggtitle("") +
  labs(color= "Models", 
       shape= "Models",
       x = "", 
       y = "Estimate") + 
  theme(legend.position="bottom") 

## Figure 3 ##

ggsave("outputs_original_data/Figure_1.pdf", width = 20,
       height = 15,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_1.png", width = 20,
       height = 15,
       units = c("cm"), dpi = 1000)

##Margins 

m6_me <- ggpredict(m6, terms = c("v2xca_academ[all]"))

ggplot(m6_me, aes(x = x, y = predicted)) +
  geom_line() +
  geom_ribbon(aes(ymin= conf.low, ymax = conf.high), alpha = 0.3) +
  theme_bw() +
  ggtitle("") +
  labs(color= "", 
       y= "Predicted Standard Deviation of Respondents",
       x = "Academic Freedom Index") + 
  theme(legend.position="bottom") +
  scale_color_viridis(discrete = TRUE, option = "D")


ggsave("outputs_original_data/Figure_2.pdf", width = 20,
       height = 12,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_2.png", width = 20,
       height = 12,
       units = c("cm"), dpi = 1000)

#### Figure Coefficient Plot Appendix ####

# Put model estimates into temporary data.frames:
model1Frame <- data.frame(term = m1$term,
                          estimate = m1$coefficients,
                          std.error = m1$std.error,
                          conf.low = m1$conf.low,
                          cond.high = m1$conf.high,
                          modelName = "Freedom to research\nand teach") %>%
  by_2sd(vdem_subset_v13_pooled)


model2Frame <- data.frame(term = m2$term,
                          estimate = m2$coefficients,
                          std.error = m2$std.error,
                          conf.low = m2$conf.low,
                          cond.high = m2$conf.high,
                          modelName = "Freedom of academic\nexchange and dissemination") %>%
  by_2sd(vdem_subset_v13_pooled)


model3Frame <- data.frame(term = m3$term,
                          estimate = m3$coefficients,
                          std.error = m3$std.error,
                          conf.low = m3$conf.low,
                          cond.high = m3$conf.high,
                          modelName = " Institutional\nautonomy") %>%
  by_2sd(vdem_subset_v13_pooled)

model4Frame <- data.frame(term = m4$term,
                          estimate = m4$coefficients,
                          std.error = m4$std.error,
                          conf.low = m4$conf.low,
                          cond.high = m4$conf.high,
                          modelName = "Campus\nintegrity") %>%
  by_2sd(vdem_subset_v13_pooled)

model5Frame <- data.frame(term = m5$term,
                          estimate = m5$coefficients,
                          std.error = m5$std.error,
                          conf.low = m5$conf.low,
                          cond.high = m5$conf.high,
                          modelName = "Freedom of academic\nand cultural expression") %>%
  by_2sd(vdem_subset_v13_pooled)

# Combine these data.frames
allModelFrame <- data.frame(rbind(model1Frame, model2Frame, model3Frame, model4Frame, model5Frame))  

allModelFrame <- allModelFrame %>%
  mutate(term = case_when(term == "(Intercept)" ~ "Intercept", 
                          term == "year" ~ "Year", 
                          term == "v2x_freexp" ~ "Freedom of Expression", 
                          term == "v2cafres" ~ "Freedom to research and teach", 
                          term == "n_v2cafres" ~ "Number of respondents", 
                          term == "v2cafexch" ~ "Freedom of academic exchange and dissemination", 
                          term == "n_v2cafexch" ~ "Number of respondents", 
                          term == "v2cainsaut" ~ "Institutional autonomy", 
                          term == "n_v2cainsaut" ~ "Number of respondents", 
                          term == "v2casurv" ~ "Campus Integrity", 
                          term == "n_v2casurv" ~ "Number of respondents", 
                          term == "v2clacfree" ~ "Freedom of academic and cultural expression", 
                          term == "n_v2clacfree" ~ "Number of respondents", 
                          term == "number_of_extern_coders" ~ "Percentage External Coders", 
                          term == "e_regionpol_6C2" ~ "Latin America and the Caribbean", 
                          term == "e_regionpol_6C3" ~ "MENA", 
                          term == "e_regionpol_6C4" ~ "SSA", 
                          term == "e_regionpol_6C5" ~ "Western Europe and North America", 
                          term == "e_regionpol_6C6" ~ "Asia and Pacific", 
                          term == "v2cafres_conf"~ "Rater's Mean Confidence", 
                          term == "v2cafexch_conf"~ "Rater's Mean Confidence", 
                          term == "v2cainsaut_conf" ~ "Rater's Mean Confidence", 
                          term == "v2casurv_conf" ~ "Rater's Mean Confidence", 
                          term == "v2clacfree_conf"~ "Rater's Mean Confidence")) %>%
  filter(!str_detect(term, "Intercept"))


# Specify the width of your confidence intervals
interval1 <- -qnorm((1-0.9)/2)  # 90% multiplier
interval2 <- -qnorm((1-0.95)/2)  # 95% multiplier

# Plot
zp1 <- ggplot(allModelFrame, aes(colour = modelName, x = factor(term, levels=c("Latin America and the Caribbean", 
                                                                               "MENA",
                                                                               "SSA", 
                                                                               "Western Europe and North America", 
                                                                               "Asia and Pacific",
                                                                               "Freedom to research and teach", 
                                                                               "Freedom of academic exchange and dissemination", 
                                                                               "Institutional autonomy", 
                                                                               "Campus Integrity", 
                                                                               "Freedom of academic and cultural expression",
                                                                               "Rater's Mean Confidence",
                                                                               "Percentage External Coders", 
                                                                               "Number of respondents", 
                                                                               "Freedom of Expression",
                                                                               "Year")))) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_linerange(aes(ymin = estimate - std.error*interval1,
                     ymax = estimate + std.error*interval1),
                 lwd = 1, position = position_dodge(width = 2/3)) +
  geom_pointrange(aes(y = estimate, ymin = estimate - std.error*interval2,
                      ymax = estimate + std.error*interval2,),
                  lwd = 1/2, position = position_dodge(width = 2/3),
                  fill = "WHITE") +
  coord_flip() + theme_bw() +
  ggtitle("") +
  labs(color= "Models", 
       shape= "Models",
       x = "") + 
  theme(legend.position="bottom") +
  scale_color_viridis(discrete = TRUE, option = "D")

print(zp1)  # The trick to these is position_dodge().

## Figure 3 ##

ggsave("outputs_original_data/Figure_E1.pdf", width = 20,
       height = 25,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_E1s.png", width = 20,
       height = 25,
       units = c("cm"), dpi = 1000)

#-------------------------------------------------------------------------#
#------------ Section 3.7 Analyzing Individual Respondent Biases ---------#
#-------------------------------------------------------------------------#

## prepare coder level data ##

summary(coder_characteristics_wide)

coder_level_ds <- read.csv(file.path(data_storage, "vdem_13_coder_level", "coder_level_ds_v13.csv"))

coder_level_ds <- coder_level_ds %>%
  left_join(coder_characteristics_wide, by = ("coder_id"))

## subset country-year data ##

vdemsubset_v13 <- vdem_full_v13 %>%
  dplyr::select(country_id, year, historical_date, v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,
                e_gdppc, e_pop, e_cow_exports,e_cow_imports)

coder_level_ds_subset <- coder_level_ds %>%
  dplyr::select(country_text_id, country_id, coder_id, historical_date, starts_with(c("v2cafres", "v2cafexch", "v2cainsaut", 
                "v2casurv", "v2clacfree")), gender, phd, government_employment, age_block, reside, main_country_id, reside,    
                v2zzfremrk, v2zzelcdem, v2zzlibdem, v2zztimespent, v2zzsatisf, v2zzfirstreas) %>%
  mutate(historical_date = ymd(historical_date)) %>%
  left_join(vdemsubset_v13, by = c("country_id", "historical_date")) %>%
  filter(year >= 1900)

coder_level_ds_subset <- coder_level_ds_subset %>%
  mutate(reside_in_country = ifelse(country_id == reside, 1, 
                                    ifelse(country_id != reside, 0, NA)))


coder_level_ds_subset$gender <- as.factor(coder_level_ds_subset$gender)
coder_level_ds_subset$age_block <- as.factor(coder_level_ds_subset$age_block)
coder_level_ds_subset$phd <- as.factor(coder_level_ds_subset$phd)
coder_level_ds_subset$government_employment <- as.factor(coder_level_ds_subset$government_employment)
coder_level_ds_subset$reside <- as.factor(coder_level_ds_subset$reside)
coder_level_ds_subset$reside_in_country <- as.factor(coder_level_ds_subset$reside_in_country)
coder_level_ds_subset$year <- as.factor(coder_level_ds_subset$year)

summary(coder_level_ds_subset)

## Freedom to research and teach ##
m1 <- lm_robust(v2cafres ~ gender + age_block + phd + government_employment + reside_in_country +
                  v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + v2zzlibdem:v2x_liberal +
           v2zzelcdem:v2x_polyarchy +
             as.factor(e_regionpol_6C)  + e_gdppc +year,
           clusters = country_id, 
         data = coder_level_ds_subset)
summary(m1)

## Freedom of academic exchange and dissemination ##
m2 <- lm_robust(v2cafexch ~ gender + age_block + phd + government_employment + reside_in_country +
                  v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + v2zzlibdem:v2x_liberal +
                  v2zzelcdem:v2x_polyarchy +
                  as.factor(e_regionpol_6C)  + e_gdppc +year,
                clusters = country_id, 
                data = coder_level_ds_subset)
summary(m2)

## Institutional autonomy ##
m3 <- lm_robust(v2cainsaut ~ gender + age_block + phd + government_employment + reside_in_country +
                  v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + v2zzlibdem:v2x_liberal +
                  v2zzelcdem:v2x_polyarchy +
                  as.factor(e_regionpol_6C)  + e_gdppc +year,
                clusters = country_id, 
                data = coder_level_ds_subset)
summary(m3)

## Campus integrity ##
m4 <- lm_robust(v2casurv ~ gender + age_block + phd + government_employment + reside_in_country +
                  v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + v2zzlibdem:v2x_liberal +
                  v2zzelcdem:v2x_polyarchy +
                  as.factor(e_regionpol_6C)  + e_gdppc +year,
                clusters = country_id, 
                data = coder_level_ds_subset)
summary(m4)

## Freedom of academic and cultural expression ##
m5 <- lm_robust(v2clacfree ~ gender + age_block + phd + government_employment + reside_in_country +
                  v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + v2zzlibdem:v2x_liberal +
                  v2zzelcdem:v2x_polyarchy +
                  as.factor(e_regionpol_6C)  + e_gdppc +year,
                clusters = country_id, 
                data = coder_level_ds_subset)
summary(m5)

# No of observations #

m1$nobs
m2$nobs
m3$nobs
m4$nobs
m5$nobs

## Pooled Analysis with all indicators ##

coder_level_ds_subset_pooled <- coder_level_ds_subset %>%
  dplyr::select(country_text_id, country_id, coder_id, historical_date, year, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree,
                v2zzfremrk, v2zzelcdem, v2zzlibdem, v2zztimespent, v2zzsatisf, v2zzfirstreas,  v2x_polyarchy, v2x_liberal, v2x_libdem,
                e_regionpol_6C,e_gdppc, e_pop, e_cow_exports,e_cow_imports, gender, age_block, phd, government_employment,reside_in_country) %>%
  pivot_longer(!c(country_text_id, country_id, coder_id, historical_date, year,v2zzfremrk, v2zzelcdem, v2zzlibdem, v2zztimespent, 
                  v2zzsatisf, v2zzfirstreas,  v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,e_gdppc, e_pop,
                  e_cow_exports,e_cow_imports, gender, age_block, phd, government_employment,reside_in_country), names_to = "indicator", values_to = "coder_value_indicator") %>%
  drop_na(coder_value_indicator)


coder_level_ds_subset_pooled$gender <- as.factor(coder_level_ds_subset_pooled$gender)
coder_level_ds_subset_pooled$age_block <- as.factor(coder_level_ds_subset_pooled$age_block)
coder_level_ds_subset_pooled$phd <- as.factor(coder_level_ds_subset_pooled$phd)
coder_level_ds_subset_pooled$government_employment <- as.factor(coder_level_ds_subset_pooled$government_employment)
coder_level_ds_subset_pooled$reside_in_country <- as.factor(coder_level_ds_subset_pooled$reside_in_country)


## pooled regression analysis ##
m6 <- lm_robust(coder_value_indicator ~gender + age_block + phd + government_employment + reside_in_country +
                  v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + 
                  v2zzlibdem:v2x_liberal + v2zzelcdem:v2x_polyarchy + e_gdppc + 
                  as.factor(e_regionpol_6C) + as.factor(year) + as.factor(indicator),
                  clusters = country_id, 
                  data = coder_level_ds_subset_pooled)
summary(m6)
m6$nobs

## Table E2 Supplementary Appendix ##
texreg(list(m1, m2, m3, m4, m5, m6), 
       head.tag = TRUE, body.tag = TRUE,
       file = "outputs_original_data/Table_E2.tex",
       digits = 3,
       omit.coef=c('year'), 
       custom.coef.names = c("Intercept",
                             "Respondent's Gender", 
                             "Respondent's Age: Middle Cohort", 
                             "Respondent's Age: Oldest Cohort", 
                             "Respondent: PhD education", 
                             "Respondent: Government employed", 
                             "Respondent: Reside in country", 
                             "Respondent supports freemarket", 
                             "Respondent supports electoral democracy", 
                             "Respondent supports liberal democracy", 
                             "time spent for coding", 
                             "Country liberal component", 
                             "Country electoral democracy level", 
                             "Latin America and the Caribbean", 
                             "MENA",
                             "SSA", 
                             "Western Europe and North America",
                             "Asia and Pacific",
                             "GDP pc", 
                             "Respondent supports liberal democracy * LibDem", 
                             "Respondent supports electoral democracy * EDI", 
                             "Dummy Freedom to research and teach", 
                             "Dummy Institutional Autonomy", 
                             "Dummy Campus Integrity", 
                             "Freedom of academic and cultural expression"),
       booktabs = TRUE,
       use.packages = FALSE,
       caption = "Linear Models predicting respondents rating with country characteristics",
       fontsize = "scriptsize",
       stars = c(0.001, 0.01, 0.05), 
       label = "Table:E2", 
       longtable = TRUE)

#### Figure E2: Coefficient Plot ####

# Put model estimates into temporary data.frames:
model1Frame <- data.frame(term = m1$term,
                           estimate = m1$coefficients,
                           std.error = m1$std.error,
                           conf.low = m1$conf.low,
                           cond.high = m1$conf.high,
                           modelName = "Freedom to research\nand teach") %>%
  by_2sd(coder_level_ds_subset)
  

model2Frame <- data.frame(term = m2$term,
                          estimate = m2$coefficients,
                          std.error = m2$std.error,
                          conf.low = m2$conf.low,
                          cond.high = m2$conf.high,
                          modelName = "Freedom of academic\nexchange and dissemination") %>%
  by_2sd(coder_level_ds_subset)


model3Frame <- data.frame(term = m3$term,
                          estimate = m3$coefficients,
                          std.error = m3$std.error,
                          conf.low = m3$conf.low,
                          cond.high = m3$conf.high,
                          modelName = " Institutional\nautonomy") %>%
  by_2sd(coder_level_ds_subset)

model4Frame <- data.frame(term = m4$term,
                          estimate = m4$coefficients,
                          std.error = m4$std.error,
                          conf.low = m4$conf.low,
                          cond.high = m4$conf.high,
                          modelName = "Campus\nintegrity") %>%
  by_2sd(coder_level_ds_subset)

model5Frame <- data.frame(term = m5$term,
                          estimate = m5$coefficients,
                          std.error = m5$std.error,
                          conf.low = m5$conf.low,
                          cond.high = m5$conf.high,
                          modelName = "Freedom of academic\nand cultural expression") %>%
  by_2sd(coder_level_ds_subset)

# Combine these data.frames
allModelFrame <- data.frame(rbind(model1Frame, model2Frame, model3Frame, model4Frame, model5Frame))  


allModelFrame <- allModelFrame %>%
  filter(!str_detect(term, "year")) %>%
  filter(!str_detect(term, "e_regionpol")) %>%
  mutate(term = case_when(term == "(Intercept)" ~ "Intercept", 
                          term == "gender1" ~ "Respondent's Gender", 
                          term == "age_block2" ~ "Respondent's Age: Middle Cohort", 
                          term == "age_block3"~  "Respondent's Age: Oldest Cohort",
                          term == "phd1" ~ "Respondent's Education: PhD", 
                          term == "government_employment1" ~ "Respondent: government employed", 
                          term == "reside_in_country1" ~ "Respondent: Reside in country", 
                          term == "v2zzfremrk" ~ "Respondent supports freemarket", 
                              term == "v2zzelcdem" ~ "Respondent supports electoral democracy", 
                              term == "v2zzlibdem" ~ "Respondent supports liberal democracy", 
                              term == "v2zztimespent" ~ "time spent for coding", 
                              term == "v2x_liberal" ~ "Country liberal component level", 
                              term == "v2x_polyarchy" ~ "Country electoral democracy level", 
                              term == "e_regionpol_6C2" ~ "Latin America and the Caribbean", 
                              term == "e_regionpol_6C3" ~ "MENA", 
                              term == "e_regionpol_6C4" ~ "SSA", 
                              term == "e_regionpol_6C5" ~ "Western Europe and North America", 
                              term == "e_regionpol_6C6" ~ "Asia and Pacific", 
                              term == "e_gdppc" ~ "GDP pc", 
                              term == "v2zzlibdem:v2x_liberal" ~ "Respondent supports liberal democracy * LibDem", 
                              term == "v2zzelcdem:v2x_polyarchy" ~ "Respondent supports electoral democracy * EDI")) 


# Specify the width of your confidence intervals
interval1 <- -qnorm((1-0.9)/2)  # 90% multiplier
interval2 <- -qnorm((1-0.95)/2)  # 95% multiplier

# Plot
zp1 <- ggplot(allModelFrame, aes(colour = modelName, x = factor(term, levels=c("Respondent's Age: Oldest Cohort", 
                                                                               "Respondent's Age: Middle Cohort",
                                                                               "Respondent's Gender", 
                                                                               "Respondent's Education: PhD", 
                                                                               "Respondent: government employed", 
                                                                               "Respondent: Reside in country",
                                                                               "time spent for coding",   
                                                                                   "Respondent supports freemarket", 
                                                                                   "Respondent supports electoral democracy",
                                                                                   "Respondent supports liberal democracy", 
                                                                                   "Country electoral democracy level", 
                                                                                   "Country liberal component level", 
                                                                                   "Respondent supports electoral democracy * EDI", 
                                                                                   "Respondent supports liberal democracy * LibDem", 
                                                                                   "GDP pc", 
                                                                                   "Asia and Pacific",
                                                                                   "Western Europe and North America",
                                                                                   "SSA", 
                                                                                   "MENA",
                                                                                   "Latin America and the Caribbean", 
                                                                                   "Intercept")))) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_linerange(aes(ymin = estimate - std.error*interval1,
                     ymax = estimate + std.error*interval1),
                 lwd = 1, position = position_dodge(width = 2/3)) +
  geom_pointrange(aes(y = estimate, ymin = estimate - std.error*interval2,
                      ymax = estimate + std.error*interval2,),
                  lwd = 1/2, position = position_dodge(width = 2/3),
                  fill = "WHITE") +
  coord_flip() + theme_bw() +
  ggtitle("") +
  labs(color= "Models", 
       shape= "Models",
       x = "") + 
  theme(legend.position="bottom") +
  scale_color_viridis(discrete = TRUE, option = "D")

print(zp1)  # The trick to these is position_dodge().

## Figure 3 ##

ggsave("outputs_original_data/Figure_E2.pdf", width = 20,
       height = 25,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_E2.png", width = 20,
       height = 25,
       units = c("cm"), dpi = 1000)

# Specify the width of your confidence intervals
interval1 <- -qnorm((1-0.9)/2)  # 90% multiplier
interval2 <- -qnorm((1-0.95)/2)  # 95% multiplier

#### Figure 3: Coefficient Plot Pooled Model ####

# Put model estimates into temporary data.frames:
model6Frame <- data.frame(term = m6$term,
                          estimate = m6$coefficients,
                          std.error = m6$std.error,
                          conf.low = m6$conf.low,
                          conf.high = m6$conf.high,
                          modelName = "Pooled Model") %>%
  by_2sd(coder_level_ds_subset_pooled) %>%
  mutate(conf.low = estimate - std.error*interval2, 
         conf.high = estimate + std.error*interval2)

# Combine these data.frames
model6Frame <- model6Frame %>%
  filter(!str_detect(term, "(year)")) %>%
  filter(!str_detect(term, "(Intercept)")) %>%
  filter(!str_detect(term, "indicator")) %>%
  filter(!str_detect(term, "e_regionpol")) %>%
  mutate(term = case_when(term == "(Intercept)" ~ "Intercept", 
                          term == "gender1" ~ "Respondent's Gender", 
                          term == "age_block2" ~ "Respondent's Age: Middle Cohort", 
                          term == "age_block3"~  "Respondent's Age: Oldest Cohort",
                          term == "phd1" ~ "Respondent's Education: PhD", 
                          term == "government_employment1" ~ "Respondent: government employed", 
                          term == "reside_in_country1" ~ "Respondent: Reside in country", 
                          term == "v2zzfremrk" ~ "Respondent support free markets",
                          term == "v2zzelcdem" ~ "Respondent supports electoral democracy", 
                          term == "v2zzlibdem" ~ "Respondent supports liberal component", 
                          term == "v2zztimespent" ~ "Time spent for coding", 
                          term == "v2x_liberal" ~ "Liberal Component", 
                          term == "v2x_polyarchy" ~ "Electoral Democracy", 
                          term == "e_gdppc" ~ "GDP per capita", 
                          term == "v2zzlibdem:v2x_liberal" ~ "Respondent supports liberal component * Liberal Component", 
                          term == "v2zzelcdem:v2x_polyarchy" ~"Respondent supports electoral democracy * Electoral Democracy"))


# Plot
ggplot(model6Frame, aes( x = factor(term, levels=c("GDP per capita", 
                                                   "Time spent for coding", 
                                                   "Respondent support free markets", 
                                                   "Respondent supports electoral democracy", 
                                                   "Respondent supports liberal component", 
                                                   "Electoral Democracy",
                                                   "Liberal Component", 
                                                   "Respondent supports electoral democracy * Electoral Democracy", 
                                                   "Respondent supports liberal component * Liberal Component", 
                                                   "Respondent's Age: Oldest Cohort", 
                                                   "Respondent's Age: Middle Cohort",
                                                   "Respondent's Gender", 
                                                   "Respondent's Education: PhD", 
                                                   "Respondent: government employed", 
                                                   "Respondent: Reside in country")))) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_linerange(aes(ymin = estimate - std.error*interval1,
                     ymax = estimate + std.error*interval1),
                 lwd = 1, position = position_dodge(width = 2/3)) +
  geom_pointrange(aes(y = estimate, ymin = estimate - std.error*interval2,
                      ymax = estimate + std.error*interval2,),
                  lwd = 1/2, position = position_dodge(width = 2/3),
                  fill = "WHITE") +
  coord_flip() + theme_bw() +
  ggtitle("") +
  labs(color= "Models", 
       shape= "Models",
       x = "", 
       y = "Estimate") + 
  theme(legend.position="bottom") 

## Figure 3 ##

ggsave("outputs_original_data/Figure_3.pdf", width = 20,
       height = 15,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_3.png", width = 20,
       height = 15,
       units = c("cm"), dpi = 1000)

##Margins 

coder_level_ds_subset_pooled$e_regionpol_6C <- as.factor(coder_level_ds_subset_pooled$e_regionpol_6C)


## pooled regression analysis ##
m6 <- lm_robust(coder_value_indicator ~gender + age_block + phd + government_employment + reside_in_country +
                  v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + 
                  v2zzlibdem:v2x_liberal + v2zzelcdem:v2x_polyarchy + e_gdppc + 
                  e_regionpol_6C + as.factor(year) + as.factor(indicator),
                clusters = country_id, 
                data = coder_level_ds_subset_pooled)
summary(m6)
m6$nobs

summary(coder_level_ds_subset_pooled$v2zzlibdem)

F4a <- plot_predictions(m6, condition = list("v2x_liberal", v2zzlibdem = "minmax"))  +
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Predicted Respondent Ratings",
       x = "Liberal Component") + 
  theme(legend.position="bottom") +
  scale_color_grey(start = 0.05,
                   end = 0.3) +
  scale_fill_grey(start = 0.05,
                  end = 0.3)

F4b <- plot_predictions(m6, condition = list("v2x_polyarchy", v2zzelcdem = "minmax"))  +
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Predicted Respondent Ratings",
       x = "Electoral Democracy Index") + 
  theme(legend.position="bottom") +
  scale_color_grey(start = 0.05,
                   end = 0.3) +
  scale_fill_grey(start = 0.05,
                  end = 0.3)

ggarrange(F4a, F4b)

ggsave("outputs_original_data/Figure_4.pdf", width = 20,
       height = 12,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_4.png", width = 20,
       height = 12,
       units = c("cm"), dpi = 1000)

#### Testing Interaction Effect between Female/Male Respondent and Reside or born in a country ####
## pooled regression analysis ##
m7 <- lm_robust(coder_value_indicator ~ gender + age_block + phd + government_employment + reside_in_country +
                  v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + v2zzlibdem:v2x_liberal +
                  v2zzelcdem:v2x_polyarchy + gender:reside_in_country + e_gdppc +
                  as.factor(e_regionpol_6C) + as.factor(year) + as.factor(indicator),
                clusters = country_id, 
                data = coder_level_ds_subset_pooled)
summary(m7)
m7$nobs


###################################################################
#### Marginal Effects for Model 6 and Model 7 conditional on gender 
###################################################################

m6_ame <- avg_comparisons(m6,  variables = list(gender = c("0", "1")))
m6_ame <- m6_ame %>%
  mutate(contrast = "Female - Male")

f2a <- ggplot(m6_ame, aes(x = contrast, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_point() +
  geom_linerange() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Average Marginal Effect",
       x = "Contrast") + 
  theme(legend.position="bottom") +
  scale_color_grey() +
  scale_fill_grey()

avg_comparisons(m6,  variables = list(gender = c("0", "1")), hypothesis = 0)

f2b <- plot_predictions(m6, condition = c("gender")) +
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Predicted Values",
       x = "Gender") + 
  theme(legend.position="bottom") +
  scale_color_grey() +
  scale_fill_grey() +
  scale_x_discrete(breaks=c("0","1"),
                   labels=c("Male", "Female"))

m7_ame <- avg_comparisons(m7,  variables = list(gender = c("0", "1")), 
                                                by = "reside_in_country")

hypotheses(m7_ame, "b1 = b2")

f2c <- ggplot(m7_ame, aes(x = reside_in_country, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_point() +
  geom_linerange() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Average Marginal Effect",
       x = "Contrast") + 
  theme(legend.position="bottom") +
  scale_color_grey() +
  scale_fill_grey() +
  scale_x_discrete(breaks=c("0","1"),
                   labels=c("Not reside in country", "Reside in country"))

f2d <- plot_predictions(m6, condition = c("gender", "reside_in_country"))+
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Predicted Values",
       x = "Gender") + 
  theme(legend.position="bottom") +
  scale_color_grey(labels=c("0" = "Not reside in country", "1"="Reside in country")) +
  scale_fill_grey() +
  scale_x_discrete(breaks=c("0","1"),
                   labels=c("Male", "Female")) 


ggarrange(f2a, f2b, f2c, f2d, labels = c("A", "B", "C", "D"))

ggsave("outputs_original_data/Figure_5.pdf", width = 25,
       height = 15,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_5.png", width = 25,
       height = 15,
       units = c("cm"), dpi = 1000)


#--------------------------------------------------------------#
#----- Section 4: Traditional Convergent Validity Analysis ----#
#--------------------------------------------------------------#

## prepare Freedom House data ##

freedomhouse_data <- read_excel("data/freedom_house/All_data_FIW_2013-2023.xlsx", sheet = 2, col_names = TRUE) 
freedomhouse_data_varnames <- as.character(freedomhouse_data[1,])

freedomhouse_data <- freedomhouse_data %>%
  rename_all(~freedomhouse_data_varnames) 
freedomhouse_data <- freedomhouse_data[-1,]

freedomhouse_data_subset <- freedomhouse_data %>%
  dplyr::select(`Country/Territory`, Edition, Region, D3) %>%
  rename(country = `Country/Territory`, 
         academic_freedom_FH = D3) %>%
  mutate(year = as.numeric(Edition) -1)

rm(freedomhouse_data_varnames, freedomhouse_data)

min(freedomhouse_data_subset$year)
max(freedomhouse_data_subset$year)


## prepare V-Dem data ##

vdem_subset_v13 <- vdem_full_v13 %>%
  mutate(historical_date = ymd(historical_date)) %>%
  dplyr::select(country_id, country_name, year, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree, v2xca_academ)

## merge FH and V-dem data ##

freedomhouse_data_subset$country_id <- countrycode(freedomhouse_data_subset$country, origin = "country.name", destination =  "vdem" )

vdem_subset_v13_fh <- vdem_subset_v13 %>%
  left_join(freedomhouse_data_subset, by = c("country_id", "year"))

## rescale variable to 0-1 ##
vdem_subset_v13_fh$academic_freedom_FH <- as.double(vdem_subset_v13_fh$academic_freedom_FH)
summary(vdem_subset_v13_fh$academic_freedom_FH )

vdem_subset_v13_fh <- vdem_subset_v13_fh %>%
  mutate(academic_freedom_FH = scales::rescale(academic_freedom_FH, to = c(0, 1)))


## Each year separate ##

plot_2012  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2012), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2012") +
  theme_pubr()


plot_2013  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2013), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2013") +
  theme_pubr()

plot_2014  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2014), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2014") +
  theme_pubr()

plot_2015  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2015), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2015") +
  theme_pubr()

plot_2016  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2016), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2016") +
  theme_pubr()

plot_2017  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2017), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2017") +
  theme_pubr()

plot_2018  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2018), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2018") +
  theme_pubr()

plot_2019  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2019), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2019") +
  theme_pubr()


plot_2020  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2020), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2020") +
  theme_pubr()

plot_2021  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2021), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2021") +
  theme_pubr()

plot_2022  <- ggplot(data = subset(vdem_subset_v13_fh, vdem_subset_v13_fh$year == 2022), aes(x=academic_freedom_FH, y = v2xca_academ)) +
  geom_point() +
  geom_smooth(method='lm', formula= y~x) +
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "2022") +
  theme_pubr()

ggarrange(plot_2012, plot_2013, plot_2014, 
          plot_2015, plot_2016, plot_2017, plot_2018, plot_2019, plot_2020, plot_2021, plot_2022, 
          ncol = 3, nrow = 4)

ggsave("outputs_original_data/Figure_F1.pdf", width = 25,
       height = 35,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_F1.png", width = 25,
       height = 35,
       units = c("cm"), dpi = 1000)



## for all years at once ##

plot_all_years  <- ggplot(data =vdem_subset_v13_fh, aes(x=academic_freedom_FH, y = v2xca_academ, label = str_c(country_name, year, sep = " "))) +
  geom_jitter() +
  geom_smooth(method='lm', formula= y~x) +
  geom_text_repel(data = subset(vdem_subset_v13_fh, academic_freedom_FH<=0.25 & v2xca_academ >=0.7)) +
  geom_text_repel(data = subset(vdem_subset_v13_fh, academic_freedom_FH<=0.00 & v2xca_academ >=0.3)) +
  geom_text_repel(data = subset(vdem_subset_v13_fh, academic_freedom_FH==1 & v2xca_academ <=0.5)) +
  geom_text_repel(data = subset(vdem_subset_v13_fh, academic_freedom_FH==0.75 & v2xca_academ <=0.4)) +
  
  xlim(0,1)+
  ylim(c(0,1))+
  labs(x = "FH Academic Freedom (normalized)", 
       y = "V-Dem Academic Freedom Index", 
       title = "") +
  theme_pubr()

corr_coef <- stats::cor(vdem_subset_v13_fh$academic_freedom_FH , vdem_subset_v13_fh$v2xca_academ,use="complete.obs")
print(corr_coef)

plot_all_years
ggsave("outputs_original_data/Figure_6.pdf", width = 20,
       height = 20,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_6.png", width = 20,
       height = 20,
       units = c("cm"), dpi = 1000)


#---------------------------------------------------#
#--respondent characteristics and FH Measure of AF--#
#---------------------------------------------------#

# level of analysis: coder-level dataset 
# for each indicator: separate regression analysis of residuals between Freedom House 
# Academic Freedom indicator and  each indicator of the AFI 
# + regression analysis of residuals from regression between AF Index and FH's indicator


## prepare Freedom House data ##

freedomhouse_data <- read_excel("data/freedom_house/All_data_FIW_2013-2023.xlsx", sheet = 2, col_names = TRUE) 
freedomhouse_data_varnames <- as.character(freedomhouse_data[1,])

freedomhouse_data <- freedomhouse_data %>%
  rename_all(~freedomhouse_data_varnames) 
freedomhouse_data <- freedomhouse_data[-1,]

freedomhouse_data_subset <- freedomhouse_data %>%
  dplyr::select(`Country/Territory`, Edition, Region, D3) %>%
  rename(country = `Country/Territory`, 
         academic_freedom_FH = D3) %>%
  mutate(year = as.numeric(Edition) -1)

rm(freedomhouse_data_varnames, freedomhouse_data)

min(freedomhouse_data_subset$year)

## subset coder_level_ds ##


#load coder-level data 
coder_level_ds <- read.csv(file.path("data/vdem_v13",  "vdem_13_coder_level", "coder_level_ds_v13.csv"))
gc()

summary(coder_characteristics_wide)

coder_level_ds <- coder_level_ds %>%
  left_join(coder_characteristics_wide, by = ("coder_id"))

coder_level_ds <- coder_level_ds %>%
  mutate(reside_in_country = ifelse(country_id == reside, 1, 
                                    ifelse(country_id != reside, 0, NA)))


coder_level_subset <- coder_level_ds %>%
  dplyr::select(country_text_id, country_id, historical_date, coder_id, 
                starts_with(c("v2cafres", "v2cafexch", "v2cainsaut", "v2casurv", "v2clacfree")), 
                starts_with("v2zz"), gender, age_block, phd, government_employment, reside_in_country) 

summary(coder_level_subset)

## Calculate standard deviation of coder's disagreement ##

temp_i <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  group_by(country_id, historical_date) %>%
  dplyr::summarize(sd_v2cafres = sd(v2cafres, na.rm = TRUE), 
            n_v2cafres = sum(!is.na(v2cafres)), 
            sd_v2cafexch = sd(v2cafexch, na.rm = TRUE), 
            n_v2cafexch =  sum(!is.na(v2cafexch)),
            sd_v2cainsaut = sd(v2cainsaut, na.rm = TRUE), 
            n_v2cainsaut =  sum(!is.na(v2cainsaut)),
            sd_v2casurv = sd(v2casurv, na.rm = TRUE), 
            n_v2casurv =  sum(!is.na(v2casurv)),
            sd_v2clacfree = sd(v2clacfree, na.rm = TRUE), 
            n_v2clacfree =  sum(!is.na(v2clacfree)))

number_of_coders <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  group_by(country_id, historical_date) %>%
  dplyr::summarize(number_coders = n_distinct(coder_id))

temp_i <- temp_i %>%
  left_join(number_of_coders, by = c("country_id", "historical_date"))

rm(number_of_coders)


vdem_subset_v13 <- vdem_full_v13 %>%
  mutate(historical_date = ymd(historical_date)) %>%
  dplyr::select(country_id, country_name, year, historical_date, 
                v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,e_gdppc, e_pop, e_cow_exports,e_cow_imports, v2x_freexp) 

## merge deviations temp_i in coder_level data 

coder_level_subset$historical_date <-  ymd(coder_level_subset$historical_date)

coder_level_subset <- coder_level_subset %>%
  left_join(temp_i, by = c("country_id", "historical_date"))

## merge onto country-level data 

coder_level_subset <- coder_level_subset %>%
  left_join(vdem_subset_v13, by = c("country_id", "historical_date"))
  
## prepare freedom house to left join cs_dataset ##

freedomhouse_data_subset$country_id <- countrycode(freedomhouse_data_subset$country, origin = "country.name", destination =  "vdem" )
freedomhouse_data_subset$year <- as.double(freedomhouse_data_subset$year)

coder_level_subset <- coder_level_subset %>%
  left_join(freedomhouse_data_subset, by = c("country_id", "year"))

coder_level_subset$academic_freedom_FH <- as.double(coder_level_subset$academic_freedom_FH)

summary(coder_level_subset$academic_freedom_FH)

summary(coder_level_subset)

## calculate mean country characteristics per country-date ##

summary(coder_level_ds$gender)
summary(coder_level_ds$government_employment)


temp_mean_char <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date), 
         year = year(historical_date)) %>%
  filter(year >=2012) %>%
  group_by(country_id, historical_date) %>%
  dplyr::summarize(agender = mean(gender, na.rm = TRUE), 
            aageblock = mean(age_block, na.rm = TRUE), 
            aphd = mean(phd, na.rm = TRUE), 
            agov_employ = mean(government_employment, na.rm = TRUE), 
            areside = mean(reside_in_country, na.rm = TRUE), 
            afremrk = mean(v2zzfremrk, na.rm = TRUE), 
            aelcdem = mean(v2zzelcdem, na.rm = TRUE), 
            alibdem = mean(v2zzlibdem, na.rm = TRUE))

coder_level_subset <- coder_level_subset %>%
  left_join(temp_mean_char, by = c("country_id", "historical_date"))

coder_level_subset <- coder_level_subset %>%
  mutate(year = year(historical_date)) %>%
  filter(year >=2012)

summary(coder_level_subset)

coder_level_subset$age_block <- as.character(coder_level_subset$age_block)

## Datasets for each Model ##

#clacfree
coder_level_m1 <- coder_level_subset %>%
  drop_na(v2clacfree, academic_freedom_FH, country_id, year, gender, agender, age_block, aageblock,
          phd, aphd, government_employment, agov_employ, reside_in_country, areside, v2zzfremrk, afremrk, v2zzelcdem, 
          aelcdem, v2zzlibdem, alibdem)
#v2cafres
coder_level_m2 <- coder_level_subset %>%
  drop_na(v2cafres, academic_freedom_FH, country_id, year, gender, agender, age_block, aageblock,
          phd, aphd, government_employment, agov_employ, reside_in_country, areside, v2zzfremrk, afremrk, v2zzelcdem, 
          aelcdem, v2zzlibdem, alibdem)

#v2cafexch
coder_level_m3 <- coder_level_subset %>%
  drop_na(v2cafexch, academic_freedom_FH, country_id, year, gender, agender, age_block, aageblock,
          phd, aphd, government_employment, agov_employ, reside_in_country, areside, v2zzfremrk, afremrk, v2zzelcdem, 
          aelcdem, v2zzlibdem, alibdem)

#v2cainsaut
coder_level_m4 <- coder_level_subset %>%
  drop_na(v2cainsaut, academic_freedom_FH, country_id, year, gender, agender, age_block, aageblock,
          phd, aphd, government_employment, agov_employ, reside_in_country, areside, v2zzfremrk, afremrk, v2zzelcdem, 
          aelcdem, v2zzlibdem, alibdem)

#v2casurv
coder_level_m5 <- coder_level_subset %>%
  drop_na(v2casurv, academic_freedom_FH, country_id, year, gender, agender, age_block, aageblock,
          phd, aphd, government_employment, agov_employ, reside_in_country, areside, v2zzfremrk, afremrk, v2zzelcdem, 
          aelcdem, v2zzlibdem, alibdem)

#### Model 1: v2clacfree ####

## calculate residuals ##
m1_res <- lm(v2clacfree ~ academic_freedom_FH + as.factor(year), data = coder_level_m1)

coder_level_m1$residuals <- m1_res$residuals
summary(coder_level_m1$residuals)

m1 <- lm_robust(residuals ~ gender + agender + age_block + aageblock + phd + aphd + government_employment + agov_employ + 
                  reside_in_country + areside + v2zzfremrk + afremrk + v2zzelcdem + aelcdem + v2zzlibdem + alibdem + 
                  sd_v2clacfree + n_v2clacfree +
                  as.factor(year),
                clusters = country_id, 
                data = coder_level_m1)
summary(m1)


#### Model 2: v2cafres ####

m2_res <- lm(v2cafres ~ academic_freedom_FH + as.factor(year), data = coder_level_m2)

coder_level_m2$residuals <- m2_res$residuals

m2 <- lm_robust(residuals ~ gender + agender + age_block + aageblock + phd + aphd + government_employment + agov_employ + 
                  reside_in_country + areside + v2zzfremrk + afremrk + v2zzelcdem + aelcdem + v2zzlibdem + alibdem + 
                  sd_v2cafres + n_v2cafres +
                  as.factor(year),
                clusters = country_id, 
                data = coder_level_m2)
summary(m2)

#### Model 3: v2cafexch ####

m3_res <- lm(v2cafexch ~ academic_freedom_FH + as.factor(year), data = coder_level_m3)

coder_level_m3$residuals <- m3_res$residuals

m3 <- lm_robust(residuals ~ gender + agender + age_block + aageblock + phd + aphd + government_employment + agov_employ + 
                  reside_in_country + areside + v2zzfremrk + afremrk + v2zzelcdem + aelcdem + v2zzlibdem + alibdem + 
                  sd_v2cafexch + n_v2cafexch +
                  as.factor(year),
                clusters = country_id, 
                data = coder_level_m3)
summary(m3)


#### Model 4: v2cainsaut ####

m4_res <- lm(v2cainsaut ~ academic_freedom_FH + as.factor(year), data = coder_level_m4)

coder_level_m4$residuals <- m4_res$residuals

m4 <- lm_robust(residuals ~ gender + agender + age_block + aageblock + phd + aphd + government_employment + agov_employ + 
                  reside_in_country + areside + v2zzfremrk + afremrk + v2zzelcdem + aelcdem + v2zzlibdem + alibdem + 
                  sd_v2cainsaut + n_v2cainsaut +
                  as.factor(year),
                clusters = country_id, 
                data = coder_level_m4)
summary(m4)

#### Model 5: v2casurv ####

m5_res <- lm(v2casurv ~ academic_freedom_FH + as.factor(year), data = coder_level_m5)

coder_level_m5$residuals <- m5_res$residuals

m5 <- lm_robust(residuals ~ gender + agender + age_block + aageblock + phd + aphd + government_employment + agov_employ + 
                  reside_in_country + areside + v2zzfremrk + afremrk + v2zzelcdem + aelcdem + v2zzlibdem + alibdem + 
                  sd_v2casurv + n_v2casurv +
                  as.factor(year),
                clusters = country_id, 
                data = coder_level_m5)
summary(m5)

#### Pooled Analysis ####

coder_level_subset_pooled <- coder_level_subset %>%
  dplyr::select(country_text_id, country_id, coder_id, historical_date, year, academic_freedom_FH, country_id, year, 
                gender, agender, age_block, aageblock,
                phd, aphd, government_employment, agov_employ, reside_in_country, areside, v2zzfremrk, afremrk, v2zzelcdem, 
                aelcdem, v2zzlibdem, alibdem, number_coders, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree) %>%
  pivot_longer(!c(country_text_id, country_id, coder_id, historical_date, year, academic_freedom_FH, country_id, year, 
                  gender, agender, age_block, aageblock,
                  phd, aphd, government_employment, agov_employ, reside_in_country, areside, v2zzfremrk, afremrk, v2zzelcdem, 
                  aelcdem, v2zzlibdem, alibdem, number_coders), 
               names_to = "indicator", values_to = "coder_value_indicator") %>%
  drop_na(coder_value_indicator, academic_freedom_FH)


coder_level_subset_pooled_sd <- coder_level_subset %>%
  dplyr::select(country_text_id, country_id, coder_id, historical_date, year, sd_v2cafres, sd_v2cafexch, sd_v2cainsaut, 
                sd_v2casurv, sd_v2clacfree) %>%
  pivot_longer(!c(country_text_id, country_id, coder_id, historical_date, year), 
               names_to = "indicator", values_to = "sd_indicator") %>%
  drop_na(sd_indicator) %>%
  mutate(indicator = case_when(indicator ==  "sd_v2cafres" ~ "v2cafres", 
                               indicator ==  "sd_v2cafexch" ~ "v2cafexch", 
                               indicator ==  "sd_v2cainsaut" ~ "v2cainsaut", 
                               indicator ==  "sd_v2casurv" ~ "v2casurv", 
                               indicator ==  "sd_v2clacfree" ~ "v2clacfree"))
  
coder_level_subset_pooled <- coder_level_subset_pooled %>%
  left_join(coder_level_subset_pooled_sd, by = c("country_text_id", "country_id", "coder_id", "historical_date", "year", "indicator")) %>%
  drop_na()


#### Model 6: pooled model ####

m6_res <- lm(coder_value_indicator ~ academic_freedom_FH + as.factor(year), data = coder_level_subset_pooled)

coder_level_subset_pooled$residuals <- m6_res$residuals

m6 <- lm_robust(residuals ~ gender + agender + age_block + aageblock + phd + aphd + government_employment + agov_employ + 
                  reside_in_country + areside + v2zzfremrk + afremrk + v2zzelcdem + aelcdem + v2zzlibdem + alibdem + 
                  sd_indicator +  number_coders +
                  as.factor(year) + indicator,
                clusters = country_id, 
                data = coder_level_subset_pooled)
summary(m6)

## Table F1 Supplementary Appendix ##
texreg(list(m1, m2, m3, m4, m5, m6), 
       head.tag = TRUE, body.tag = TRUE,
       #file = "outputs_original_data/Table_F1.tex",
       digits = 3,
       omit.coef=c('year'), 
       custom.coef.names = c("Intercept",
                             "Respondent's Gender", 
                             "Share female respondents",
                             "Respondent's Age: Middle Cohort", 
                             "Respondent's Age: Oldest Cohort", 
                             "Average Age of respondents", 
                             "Respondent: PhD education", 
                             "Share of respondents with PhD", 
                             "Respondent employed by government", 
                             "Share respondents employed by government", 
                             "Respondent: Reside in country", 
                             "Share respondent reside in country", 
                             "Respondent supports freemarket", 
                             "Average support for free market among respondents",
                             "Respondent supports electoral democracy", 
                             "Average support for electoral democracy among respondents",
                             "Respondent supports liberal democracy", 
                             "Average support for liberal democracy among respondents",
                             "Respondent Disagreement", 
                             "Number of respondents", 
                             "Respondent Disagreement", 
                             "Number of respondents", 
                             "Respondent Disagreement", 
                             "Number of respondents", 
                             "Respondent Disagreement", 
                             "Number of respondents", 
                             "Respondent Disagreement", 
                             "Number of respondents", 
                             "Respondent Disagreement", 
                             "Number of respondents", 
                             "Dummy Freedom to research and teach", 
                             "Dummy Institutional Autonomy", 
                             "Dummy Campus Integrity", 
                             "Dummy Freedom of academic and cultural expression"),
       booktabs = TRUE,
       use.packages = FALSE,
       caption = "Linear Models predicting respondents rating with country characteristics",
       fontsize = "scriptsize",
       stars = c(0.001, 0.01, 0.05), 
       label = "Table:F1", 
       longtable = TRUE)


#### Figure 7: Coefficient Plot Pooled Model ####


# Specify the width of your confidence intervals
interval1 <- -qnorm((1-0.9)/2)  # 90% multiplier
interval2 <- -qnorm((1-0.95)/2)  # 95% multiplier

# Put model estimates into temporary data.frames:
model6Frame <- data.frame(term = m6$term,
                          estimate = m6$coefficients,
                          std.error = m6$std.error,
                          conf.low = m6$conf.low,
                          conf.high = m6$conf.high,
                          modelName = "Pooled Model") %>%
  by_2sd(coder_level_subset_pooled) %>%
  mutate(conf.low = estimate - std.error*interval2, 
         conf.high = estimate + std.error*interval2)



# Combine these data.frames
model6Frame <- model6Frame %>%
  filter(!str_detect(term, "(year)")) %>%
  filter(!str_detect(term, "(Intercept)")) %>%
  filter(!str_detect(term, "indicatorv")) %>%
  filter(term %in% c("agender", "aageblock", "aphd", "agov_employ", "areside", "afremrk", "aelcdem", "alibdem",
                     "sd_indicator", "number_coders")) %>%
  mutate(term = case_when(term == "agender" ~ "Share female respondents", 
                          term == "aageblock" ~ "Average age of respondents", 
                          term == "aphd"~  "Share respondents with PhD",
                          term == "agov_employ" ~ "Share respondents employed by government", 
                          term == "areside" ~ "Share respondents residing in country", 
                          term == "afremrk" ~ "Average support for free market among respondents", 
                          term == "aelcdem" ~ "Average support for electoral democracy among respondents",
                          term == "alibdem" ~ "Average support for liberal democracy among respondents", 
                          term == "sd_indicator" ~ "Respondent disagreement", 
                          term == "number_coders" ~ "Number of respondents"))



# Plot
ggplot(model6Frame, aes( x = factor(term, levels=c("Share female respondents", 
                                                   "Average age of respondents", 
                                                   "Share respondents with PhD",
                                                   "Share respondents employed by government", 
                                                   "Share respondents residing in country", 
                                                   "Average support for free market among respondents", 
                                                   "Average support for electoral democracy among respondents",
                                                   "Average support for liberal democracy among respondents", 
                                                   "Respondent disagreement", 
                                                   "Number of respondents")))) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_linerange(aes(ymin = estimate - std.error*interval1,
                     ymax = estimate + std.error*interval1),
                 lwd = 1, position = position_dodge(width = 2/3)) +
  geom_pointrange(aes(y = estimate, ymin = estimate - std.error*interval2,
                      ymax = estimate + std.error*interval2,),
                  lwd = 1/2, position = position_dodge(width = 2/3),
                  fill = "WHITE") +
  coord_flip() + theme_bw() +
  ggtitle("") +
  labs(color= "Models", 
       shape= "Models",
       x = "", 
       y = "Estimate") + 
  theme(legend.position="bottom") 


## Figure 7 ##

ggsave("outputs_original_data/Figure_7.pdf", width = 20,
       height = 15,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_7.png", width = 20,
       height = 15,
       units = c("cm"), dpi = 1000)

#### Figure F2: Coefficient Plot ####

# Put model estimates into temporary data.frames:
model1Frame <- data.frame(term = m1$term,
                          estimate = m1$coefficients,
                          std.error = m1$std.error,
                          conf.low = m1$conf.low,
                          cond.high = m1$conf.high,
                          modelName = "Freedom to research\nand teach") %>%
  by_2sd(coder_level_m1)


model2Frame <- data.frame(term = m2$term,
                          estimate = m2$coefficients,
                          std.error = m2$std.error,
                          conf.low = m2$conf.low,
                          cond.high = m2$conf.high,
                          modelName = "Freedom of academic\nexchange and dissemination") %>%
  by_2sd(coder_level_m2)


model3Frame <- data.frame(term = m3$term,
                          estimate = m3$coefficients,
                          std.error = m3$std.error,
                          conf.low = m3$conf.low,
                          cond.high = m3$conf.high,
                          modelName = " Institutional\nautonomy") %>%
  by_2sd(coder_level_m3)

model4Frame <- data.frame(term = m4$term,
                          estimate = m4$coefficients,
                          std.error = m4$std.error,
                          conf.low = m4$conf.low,
                          cond.high = m4$conf.high,
                          modelName = "Campus\nintegrity") %>%
  by_2sd(coder_level_m4)

model5Frame <- data.frame(term = m5$term,
                          estimate = m5$coefficients,
                          std.error = m5$std.error,
                          conf.low = m5$conf.low,
                          cond.high = m5$conf.high,
                          modelName = "Freedom of academic\nand cultural expression") %>%
  by_2sd(coder_level_m5)

# Combine these data.frames
allModelFrame <- data.frame(rbind(model1Frame, model2Frame, model3Frame, model4Frame, model5Frame))  

allModelFrame <- allModelFrame %>%
  filter(!str_detect(term, "year|Intercept")) %>%
  filter(term %in% c("agender", "aageblock", "aphd", "agov_employ", "areside", "afremrk", "aelecdem", "alibdem",
                     "sd_indicator", "number_coders", "sd_v2clacfree", "n_v2clacfree", 
                     "sd_v2cafres", "n_v2cafres", "sd_v2cafexch","n_v2cafexch", "sd_v2cainsaut", "n_v2cainsaut", 
                     "sd_v2casurv", "n_v2casurv")) %>%
  mutate(term = case_when(term == "agender" ~ "Share female respondents", 
                          term == "aageblock" ~ "Average age of respondents", 
                          term == "aphd"~  "Share respondents with PhD",
                          term == "agov_employ" ~ "Share respondents employed by government", 
                          term == "areside" ~ "Share respondents residing in country", 
                          term == "afremrk" ~ "Average support for free market among respondents", 
                          term == "aelecdem" ~ "Average support for electoral democracy among respondents",
                          term == "alibdem" ~ "Average support for liberal democracy among respondents", 
                          term == "sd_indicator" ~ "Respondent disagreement", 
                          term == "number_coders" ~ "Number of respondents",
                          term == "sd_v2clacfree" ~ "Number of respondents", 
                          term == "n_v2clacfree" ~ "Respondent disagreement", 
                          term == "sd_v2cafres" ~ "Number of respondents", 
                          term == "n_v2cafres" ~ "Respondent disagreement", 
                          term == "sd_v2cafexch" ~ "Number of respondents", 
                          term == "n_v2cafexch" ~ "Respondent disagreement", 
                          term == "sd_v2cainsaut" ~ "Number of respondents", 
                          term == "n_v2cainsaut" ~ "Respondent disagreement", 
                          term == "sd_v2casurv" ~ "Number of respondents", 
                          term == "n_v2casurv" ~ "Respondent disagreement"))


# Specify the width of your confidence intervals
interval1 <- -qnorm((1-0.9)/2)  # 90% multiplier
interval2 <- -qnorm((1-0.95)/2)  # 95% multiplier

# Plot
zp1 <- ggplot(allModelFrame, aes(colour = modelName, x = factor(term,levels=c("Share female respondents", 
                                                                              "Average age of respondents", 
                                                                              "Share respondents with PhD",
                                                                              "Share respondents employed by government", 
                                                                              "Share respondents residing in country", 
                                                                              "Average support for free market among respondents", 
                                                                              "Average support for electoral democracy among respondents",
                                                                              "Average support for liberal democracy among respondents", 
                                                                              "Respondent disagreement", 
                                                                              "Number of respondents")))) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_linerange(aes(ymin = estimate - std.error*interval1,
                     ymax = estimate + std.error*interval1),
                 lwd = 1, position = position_dodge(width = 2/3)) +
  geom_pointrange(aes(y = estimate, ymin = estimate - std.error*interval2,
                      ymax = estimate + std.error*interval2,),
                  lwd = 1/2, position = position_dodge(width = 2/3),
                  fill = "WHITE") +
  coord_flip() + theme_bw() +
  ggtitle("") +
  labs(color= "Models", 
       shape= "Models",
       x = "") + 
  theme(legend.position="bottom") +
  scale_color_viridis(discrete = TRUE, option = "D")

print(zp1)  # The trick to these is position_dodge().

## Figure F2 ##

ggsave("outputs_original_data/Figure_F2.pdf", width = 20,
       height = 25,
       units = c("cm"), dpi = 1000)
ggsave("outputs_original_data/Figure_F2.png", width = 20,
       height = 25,
       units = c("cm"), dpi = 1000)

